if(!requireNamespace("MSTExplorer", quietly = TRUE)){
remotes::install_github("neurogenomics/MSTExplorer", dependencies = TRUE)
}
if(!requireNamespace("echodata", quietly = TRUE)){
remotes::install_github("RajLabMSSM/echodata", dependencies = TRUE)
}
knitr::opts_chunk$set(warning = FALSE,
cache = TRUE,
message = FALSE)
Here, we wanted to map CSL areas of interest onto the Human Phenotype Ontology (HPO) and/or other disease ontologies (OMIM/ORPH). This allowed us to connect our phenotype-cell type association results to the CSL areas of interest. For a description of our methodology, see here:
Identification of cell type-specific gene targets underlying thousands of rare diseases and subtraits Kitty B. Murphy, Robert Gordon-Smith, Jai Chapman, Momoko Otani, Brian M. Schilder, Nathan G. Skene https://www.medrxiv.org/content/10.1101/2023.02.13.23285820v1
First, we:
Gathered all diseases/phenotypes that CSL lists on their website and/or the Research Accelerator Initiative materials.
For each disease/phenotype, we assigned a Group based on whether CSL has expressed interest in Early Stage Partnering with external labs (e.g. via the RAI) or whether they are already actively pursuing research in these fields.
We then grouped each disease/phenotype into a broader Area (“Immunology”) and a particular disease Subarea (“Dermatomyositis”).
Next, we mapped each disease/phenotype onto a standardised HPO ID or OMIM/Orphanet ID.
csl <- data.table::fread(here::here("data/CSL_areas_of_interest.tsv"))
# csl <- csl[Group=="Early Stage Partnering"]
# extract IDs from the Entry column of csl data.table with the pattern "HP:", "OMIM:", "ORPHA:" and put into a new col
csl[, ID := stringr::str_extract(Entry, "HP:\\d+|OMIM:\\d+|ORPHA:\\d+")]
# extract names from Entry column WITHOUT the ID
csl[, Name := trimws(stringr::str_remove(Entry, "HP:\\d+|OMIM:\\d+|ORPHA:\\d+"))]
MSTExplorer::create_dt(csl)
Next, I took all the HPO IDs and expanded this list to any HPO terms that are descendants of the original HPO IDs. This allows us to capture any phenotypes that are more specific than the original CSL HPO IDs.
hpo_ids <- csl[grepl("HP:",ID)]$ID |> unique()
disease_ids <- csl[!grepl("HP:",ID)]$ID |> unique()
hpo <- KGExplorer::get_ontology("hp")
hpo_ids_list <- lapply(stats::setNames(hpo_ids,hpo_ids),
function(x) simona::dag_offspring(dag = hpo,
include_self = TRUE,
term = x))
hpo_ids_extended <- unlist(hpo_ids_list) |> unique()
Next, I imported the results of our phenotype-cell type association
analysis. This analysis was performed using the MSTExplorer
package and the results are stored in a data.table.
results <- MSTExplorer::load_example_results()
results <- HPOExplorer::add_hpo_name(results, hpo = hpo)
results <- HPOExplorer::add_ont_lvl(results)
results <- HPOExplorer::add_ancestor(results, hpo = hpo)
results <- MSTExplorer::map_celltype(results)
MSTExplorer::add_logfc(results)
results[,effect:=estimate]
p2g <- HPOExplorer::load_phenotype_to_genes()
We developed a pipeline to help filter down our results to identif the promise promising gene targets for the most severe phenotypes. It includes a number of different steps, including filtering on cell type specific gene expression and the evidence supporting the causal relationships between each gene and a phenotype.
Ultimately, it allows us to trace multi-scale disease mechanisms from genes –> cell types –> phenotypes –> diseases
prioritise_targets_out <- MSTExplorer::prioritise_targets(
results = results,
hpo = hpo,
phenotype_to_genes = p2g,
keep_deaths=NULL,
keep_onsets=NULL,
keep_specificity_quantiles = seq(30,40), ## NULL:70, 30-40:64
keep_mean_exp_quantiles = seq(30,40), ## NULL:65, 10:55
info_content_threshold=8, ## 8:55, 5:64
effect_threshold=NULL, ## 1:39
severity_score_gpt_threshold=NULL, ## 10:78, NULL:82
symptom_intersection_threshold=.25, ## .25:57
evidence_score_threshold=3, ## 5:47, 4:47, 3:64
top_n = 10, ## 5:38, 20:42, 30:45, 40:52, 50:55
group_vars = "hpo_id")
all_targets <- prioritise_targets_out$top_targets
targets <- all_targets[
(hpo_id %in% hpo_ids_extended
| disease_id %in% disease_ids
)
]
MSTExplorer::create_dt(head(targets,100))
Here we summarise the percentage of CSL phenotypes of interest (with HPO IDs) included in our prioritised cell type-specific targets. We also compute the proportion of CSL diseases (with OMIM/Orphanet IDs) that have at least one of those phenotypes as a symptom.
message(paste0(
length(intersect(results$hpo_id,hpo_ids_extended)),"/",
length(unique(hpo_ids_extended)),
" (",round(length(intersect(hpo_ids_extended,results$hpo_id))/
length(unique(hpo_ids_extended))*100,2),"%)",
" CSL phenotypes",
" covered in our phenotype-cell type association results."
))
## 1166/1860 (62.69%) CSL phenotypes covered in our phenotype-cell type association results.
message(paste0(
length(intersect(all_targets$hpo_id,hpo_ids_extended)),"/",
length(unique(hpo_ids_extended)),
" (",round(length(intersect(hpo_ids_extended,all_targets$hpo_id))/
length(unique(hpo_ids_extended))*100,2),"%)",
" CSL phenotypes (across ",length(unique(all_targets$disease_id))," diseases)",
" covered in our prioritised targets."
))
## 594/1860 (31.94%) CSL phenotypes (across 4850 diseases) covered in our prioritised targets.
From our list of prioritised cell type-specific gene targets, we can now select the top targets for each CSL area of interest.
We have arbitrarily set a cutoff of 3 targets per area, but this can easily be adjusted as needed.
top_n <- 3
# top_targets_ancestor <- targets[!duplicated(ancestor_name),]
csl_areas <- unique(csl[,c("ID","Area","Subarea")])
targets2 <- targets |>
merge(csl_areas,
all.x=TRUE,
by.x="hpo_id",by.y="ID") |>
merge(csl_areas,
all.x=TRUE,
by.x="disease_id",by.y="ID")
targets2[,Area:=data.table::fcoalesce(Area.x,Area.y)]
targets2[,Subarea:=data.table::fcoalesce(Subarea.x,Subarea.y)]
#### Select top N targets per CSL Area ####
num_cols <- sapply(targets2,is.numeric)
top_targets <- targets2[!is.na(Area), head(.SD,top_n),
by=c("Area")]
# drop list columns
# save_path <- here::here("reports/top_targets_CSL.tsv")
# top_targets[,-names(top_targets)[sapply(top_targets,is.list)],with=FALSE] |>
# data.table::fwrite(save_path, sep="\t")
# top_targets <- data.table::fread(here::here("top_targets_CSL.tsv"))
MSTExplorer::create_dt(top_targets)
We can also count the number of targets, genes, phenotypes and diseases per area of interest.
target_counts <- targets2[,list(n_targets=.N,
n_genes=data.table::uniqueN(gene_symbol),
n_phenotypes=data.table::uniqueN(hpo_id),
n_diseases=data.table::uniqueN(disease_id)
),by=c("Area","Subarea")][!is.na(Area)]|>
data.table::setorderv("n_targets",-1)
MSTExplorer::create_dt(target_counts)
Now that we have the top candidate targets for CSL disease areas, we can explore them in more detail. For example, we can use additional resources (ClinVar, Variant Effect Predictor, gnomad) to find out which variants are deleterious within (or around) the gene targets. We can also gathered population-level data on the frequency of these variants, giving us a better idea of the number of patients that may benefit from treating this particular mechanism.
Let’s first explore phenotypes related to “Stroke”.
As an example, we’re going to look specifically at the role of COL4A1 in stroke.
stroke_targets <- targets2[grepl("stroke",hpo_name, ignore.case = TRUE)]
stroke_targets <- stroke_targets[gene_symbol=="COL4A1"]
We gathered additional data from the gnomad website on the COL4A1 gene (i.e. ENSG00000187498).
Our goal here was to:
Identify confirmed pathogenic variants in COL4A1.
Check the frequency of these variants in the general population.
Identify a particular exon that has the highest frequency of pathogenic variants. The idea being that this exon could be a good target for therapeutic intervention via ASO-induced exon-skipping.
Additional info:
Exon 3 spans the following genomic coordinates: chr13:110213926-110214015
#### Import ####
gnomad <- data.table::fread(here::here("data/gnomAD_v4.0.0_ENSG00000187498_2024_02_08_11_13_20.csv"),
check.names = TRUE)
#### Add exon information ####
db <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
ex.gn <- GenomicFeatures::exonsBy(db, "gene")
ex <- ex.gn[["1282"]]
ex$exon_width <- GenomicRanges::width(ex)
# txs <- GenomicFeatures::transcriptsBy(db, by = "gene")
# tx.gn <- GenomicFeatures::exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene, by = "tx")
# tx <- tx.gn[["1282"]]
#### Convert to GRanges ####
gn <- echodata::dt_to_granges(gnomad,
chrom_col = "Chromosome",
start_col = "Position",
style="UCSC") |>
## Merge gnomad with exon data
IRanges::mergeByOverlaps(ex)
gn$ex=NULL
gn$`echodata::dt_to_granges(gnomad, chrom_col = "Chromosome", start_col = "Position", `=NULL
gn <- data.table::as.data.table(gn)
gn[,exon_variants_n:=data.table::uniqueN(gnomAD.ID),by="exon_id"]
gn[,exon_variants_perbp:=data.table::uniqueN(gnomAD.ID)/exon_width,by="exon_id"]
gn[,exon_pathovariants_perbp:=data.table::uniqueN(
gnomAD.ID[grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20])/exon_width,by="exon_id"]
gn[,exon_pathovariants_cumfreq:=sum(
Allele.Frequency[grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20]),by="exon_id"]
gn_top <-gn[exon_pathovariants_cumfreq %in% head(data.table::fsort(unique(exon_pathovariants_cumfreq),decreasing=TRUE),2)][
grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20,
]
gn_top[,head(.SD,1),by="exon_id",
.SDcols = c("exon_width",
"exon_variants_n",
"exon_variants_perbp",
"exon_pathovariants_perbp",
"exon_pathovariants_cumfreq")] |>
MSTExplorer::create_dt()
We found two exons with the highest cumulative frequency of pathogenic variants in the COL4A1 gene. Of these, exon 3 has the highest cumulative frequency of pathogenic variants.
Follow up research into the existing literature was carried out (not shown here), to confirm whether:
any similar therapies currently existing for this gene
exon 3 could be safely skipped
there are any animal models available for the homologous exons in this gene
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/London
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] fs_1.6.3
## [2] matrixStats_1.2.0
## [3] bitops_1.0-7
## [4] httr_1.4.7
## [5] RColorBrewer_1.1-3
## [6] doParallel_1.0.17
## [7] tools_4.3.1
## [8] backports_1.4.1
## [9] utf8_1.2.4
## [10] R6_2.5.1
## [11] DT_0.32
## [12] lazyeval_0.2.2
## [13] GetoptLong_1.0.5
## [14] prettyunits_1.2.0
## [15] cli_3.6.2
## [16] Biobase_2.62.0
## [17] sass_0.4.8
## [18] readr_2.1.5
## [19] ewceData_1.10.0
## [20] Rsamtools_2.18.0
## [21] yulab.utils_0.1.4
## [22] R.utils_2.12.3
## [23] dichromat_2.0-0.1
## [24] orthogene_1.9.1
## [25] maps_3.4.2
## [26] limma_3.58.1
## [27] rstudioapi_0.15.0
## [28] RSQLite_2.3.5
## [29] pals_1.9
## [30] generics_0.1.3
## [31] gridGraphics_0.5-1
## [32] shape_1.4.6.1
## [33] BiocIO_1.12.0
## [34] gtools_3.9.5
## [35] crosstalk_1.2.1
## [36] car_3.1-2
## [37] dplyr_1.1.4
## [38] zip_2.3.1
## [39] homologene_1.4.68.19.3.27
## [40] Matrix_1.6-5
## [41] fansi_1.0.6
## [42] S4Vectors_0.40.2
## [43] abind_1.4-5
## [44] R.methodsS3_1.8.2
## [45] lifecycle_1.0.4
## [46] scatterplot3d_0.3-44
## [47] yaml_2.3.8
## [48] carData_3.0-5
## [49] SummarizedExperiment_1.32.0
## [50] gplots_3.1.3.1
## [51] SparseArray_1.2.4
## [52] BiocFileCache_2.10.1
## [53] grid_4.3.1
## [54] blob_1.2.4
## [55] promises_1.2.1
## [56] ExperimentHub_2.10.0
## [57] crayon_1.5.2
## [58] lattice_0.22-5
## [59] GenomicFeatures_1.54.4
## [60] chromote_0.2.0
## [61] KEGGREST_1.42.0
## [62] mapproj_1.2.11
## [63] pillar_1.9.0
## [64] knitr_1.45
## [65] ComplexHeatmap_2.18.0
## [66] KGExplorer_0.99.0
## [67] GenomicRanges_1.54.1
## [68] rjson_0.2.21
## [69] codetools_0.2-19
## [70] glue_1.7.0
## [71] ggfun_0.1.4
## [72] data.table_1.15.2
## [73] vctrs_0.6.5
## [74] png_0.1-8
## [75] treeio_1.26.0
## [76] gtable_0.3.4
## [77] HPOExplorer_1.0.0
## [78] cachem_1.0.8
## [79] xfun_0.42
## [80] openxlsx_4.2.5.2
## [81] S4Arrays_1.2.1
## [82] mime_0.12
## [83] tidygraph_1.3.1
## [84] SingleCellExperiment_1.24.0
## [85] RNOmni_1.0.1.2
## [86] iterators_1.0.14
## [87] simona_1.0.10
## [88] statmod_1.5.0
## [89] interactiveDisplayBase_1.40.0
## [90] ellipsis_0.3.2
## [91] nlme_3.1-164
## [92] ggtree_3.10.1
## [93] EWCE_1.11.3
## [94] bit64_4.0.5
## [95] progress_1.2.3
## [96] filelock_1.0.3
## [97] GenomeInfoDb_1.38.7
## [98] rprojroot_2.0.4
## [99] bslib_0.6.1
## [100] KernSmooth_2.23-22
## [101] colorspace_2.1-0
## [102] BiocGenerics_0.48.1
## [103] DBI_1.2.2
## [104] tidyselect_1.2.1
## [105] processx_3.8.3
## [106] bit_4.0.5
## [107] compiler_4.3.1
## [108] curl_5.2.1
## [109] rvest_1.0.4
## [110] httr2_1.0.0
## [111] xml2_1.3.6
## [112] DelayedArray_0.28.0
## [113] plotly_4.10.4
## [114] rtracklayer_1.62.0
## [115] scales_1.3.0
## [116] caTools_1.18.2
## [117] rappdirs_0.3.3
## [118] stringr_1.5.1
## [119] digest_0.6.35
## [120] piggyback_0.1.5
## [121] rmarkdown_2.26
## [122] XVector_0.42.0
## [123] htmltools_0.5.7
## [124] pkgconfig_2.0.3
## [125] GeneOverlap_1.38.0
## [126] MatrixGenerics_1.14.0
## [127] echodata_0.99.17
## [128] dbplyr_2.4.0
## [129] fastmap_1.1.1
## [130] rlang_1.1.3
## [131] GlobalOptions_0.1.2
## [132] htmlwidgets_1.6.4
## [133] shiny_1.8.0
## [134] jquerylib_0.1.4
## [135] jsonlite_1.8.8
## [136] BiocParallel_1.36.0
## [137] R.oo_1.26.0
## [138] RCurl_1.98-1.14
## [139] magrittr_2.0.3
## [140] GenomeInfoDbData_1.2.11
## [141] ggplotify_0.1.2
## [142] patchwork_1.2.0
## [143] munsell_0.5.0
## [144] Rcpp_1.0.12
## [145] ape_5.7-1
## [146] babelgene_22.9
## [147] stringi_1.8.3
## [148] zlibbioc_1.48.0
## [149] AnnotationHub_3.10.0
## [150] plyr_1.8.9
## [151] parallel_4.3.1
## [152] Biostrings_2.70.2
## [153] hms_1.1.3
## [154] circlize_0.4.16
## [155] ps_1.7.6
## [156] igraph_2.0.3
## [157] ggpubr_0.6.0
## [158] ggsignif_0.6.4
## [159] reshape2_1.4.4
## [160] biomaRt_2.58.2
## [161] stats4_4.3.1
## [162] gprofiler2_0.2.3
## [163] BiocVersion_3.18.1
## [164] XML_3.99-0.16.1
## [165] evaluate_0.23
## [166] BiocManager_1.30.22
## [167] tzdb_0.4.0
## [168] foreach_1.5.2
## [169] httpuv_1.6.14
## [170] rols_2.30.2
## [171] grr_0.9.5
## [172] tidyr_1.3.1
## [173] purrr_1.0.2
## [174] clue_0.3-65
## [175] ggplot2_3.5.0
## [176] broom_1.0.5
## [177] xtable_1.8-4
## [178] restfulr_0.0.15
## [179] tidytree_0.4.6
## [180] rstatix_0.7.2
## [181] later_1.3.2
## [182] viridisLite_0.4.2
## [183] MSTExplorer_1.0.0
## [184] Polychrome_1.5.1
## [185] TxDb.Hsapiens.UCSC.hg38.knownGene_3.18.0
## [186] tibble_3.2.1
## [187] websocket_1.4.1
## [188] aplot_0.2.2
## [189] GenomicAlignments_1.38.2
## [190] memoise_2.0.1.9000
## [191] AnnotationDbi_1.64.1
## [192] IRanges_2.36.0
## [193] cluster_2.1.6
## [194] HGNChelper_0.8.1
## [195] here_1.0.1